In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AgglomerativeClustering

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")

In [2]:
df = pd.read_csv("data/School_Progress_Reports_-_All_Schools_-_2009-10.csv")

In [3]:
print df.shape
df.head()


(1677, 17)
Out[3]:
DBN | DISTRICT | SCHOOL | PRINCIPAL | PROGRESS REPORT TYPE | SCHOOL LEVEL* | PEER INDEX* | 2009-2010 OVERALL GRADE | 2009-2010 OVERALL SCORE | 2009-2010 ENVIRONMENT CATEGORY SCORE | 2009-2010 ENVIRONMENT GRADE | 2009-2010 PERFORMANCE CATEGORY SCORE | 2009-2010 PERFORMANCE GRADE | 2009-2010 PROGRESS CATEGORY SCORE | 2009-2010 PROGRESS GRADE | 2009-2010 ADDITIONAL CREDIT | 2008-09 PROGRESS REPORT GRADE
0 | 01M015 | 1 | P.S. 015 Roberto Clemente | Thomas Staebell | EMS | Elementary | 62.65 | C | 37.6 | 7.2 | B | 3.0 | D | 27.4 | B | 0.0 | B
1 | 01M019 | 1 | P.S. 019 Asher Levy | Jacqueline Flanagan | EMS | Elementary | 48.94 | C | 35.7 | 6.4 | B | 4.9 | D | 24.4 | B | 0.0 | A
2 | 01M020 | 1 | P.S. 020 Anna Silver | James Lee | EMS | Elementary | 57.68 | A | 70.3 | 9.3 | A | 7.9 | C | 44.1 | A | 9.0 | A
3 | 01M034 | 1 | P.S. 034 Franklin D. Roosevelt | Joyce Stallings Harte | EMS | K-8 | 66.75 | B | 53.0 | 6.8 | B | 7.4 | C | 33.0 | B | 5.8 | A
4 | 01M063 | 1 | P.S. 063 William McKinley | Darlene Despeignes | EMS | Elementary | 57.55 | B | 54.0 | 7.8 | B | 7.1 | C | 35.8 | A | 3.3 | B

In [50]:
cols = df.columns
print "\n".join("[%s] %s" % k for k in enumerate(cols))
print cols[9:16:2]
df = df.fillna(0)


[0] DBN
[1] DISTRICT
[2] SCHOOL
[3] PRINCIPAL
[4] PROGRESS REPORT TYPE
[5] SCHOOL LEVEL*
[6] PEER INDEX*
[7] 2009-2010 OVERALL GRADE
[8] 2009-2010 OVERALL SCORE
[9] 2009-2010 ENVIRONMENT CATEGORY SCORE
[10] 2009-2010 ENVIRONMENT GRADE
[11] 2009-2010 PERFORMANCE CATEGORY SCORE
[12] 2009-2010 PERFORMANCE GRADE
[13] 2009-2010 PROGRESS CATEGORY SCORE
[14] 2009-2010 PROGRESS GRADE
[15] 2009-2010 ADDITIONAL CREDIT
[16] 2008-09 PROGRESS REPORT GRADE
Index([u'2009-2010 ENVIRONMENT CATEGORY SCORE',
       u'2009-2010 PERFORMANCE CATEGORY SCORE',
       u'2009-2010 PROGRESS CATEGORY SCORE', u'2009-2010 ADDITIONAL CREDIT'],
      dtype='object')

In [51]:
class MultiColumnExtractor(TransformerMixin):
    """Pipeline step that selects a fixed set of DataFrame columns."""
    def __init__(self, colnames):
        print "Initialized extractor for columns %s" % colnames
        self.colnames = colnames
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        # Return only the requested columns; downstream steps see just these features.
        print "Extracting columns [%s]" % (self.colnames,)
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
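
A quick standalone check of the transformer (a minimal sketch, not part of the original run; it assumes df and cols from the cells above):

extractor = MultiColumnExtractor(cols[9:16:2])
# fit_transform comes from TransformerMixin: fit() is a no-op and transform()
# returns a DataFrame holding only the four score columns.
extractor.fit_transform(df).head()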

In [53]:
pipeline = Pipeline([
        ("columns", MultiColumnExtractor(cols[9:16:2])),
        ("features_scaled", StandardScaler())
    ])


Initialized extractor for columns Index([u'2009-2010 ENVIRONMENT CATEGORY SCORE',
       u'2009-2010 PERFORMANCE CATEGORY SCORE',
       u'2009-2010 PROGRESS CATEGORY SCORE', u'2009-2010 ADDITIONAL CREDIT'],
      dtype='object')
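
FeatureUnion is imported above but not used in this walkthrough; a hedged sketch of how the extractor could feed parallel branches of the same pipeline (splitting into scaled scores plus a pass-through credit column is illustrative, not the original design):

union_pipeline = Pipeline([
    ("features", FeatureUnion([
        # Scale the three category scores...
        ("scores_scaled", Pipeline([
            ("columns", MultiColumnExtractor(cols[9:14:2])),
            ("scale", StandardScaler()),
        ])),
        # ...and pass the additional-credit column through untouched.
        ("extra_credit", MultiColumnExtractor(cols[15:16])),
    ])),
])
# union_pipeline.fit_transform(df) would again yield a (1677, 4) feature matrix.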

In [54]:
cluster = AgglomerativeClustering(n_clusters=2)
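
AgglomerativeClustering defaults to Ward linkage, which at each step merges the pair of clusters giving the smallest increase in within-cluster variance; spelling that out explicitly (an equivalent sketch, not from the original run):

# Identical to the call above, with the linkage choice made explicit.
cluster = AgglomerativeClustering(n_clusters=2, linkage="ward")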

In [55]:
X = pipeline.fit_transform(df)
print X.shape
cids = cluster.fit_predict(X)
cols = df.columns
df_pred = pd.concat((df[df.columns[:7]], pd.DataFrame(X, columns=cols[9:16:2]), pd.Series(cids, name="ClusterID")), axis=1)
df_pred.head()


Extracting columns [Index([u'2009-2010 ENVIRONMENT CATEGORY SCORE',
       u'2009-2010 PERFORMANCE CATEGORY SCORE',
       u'2009-2010 PROGRESS CATEGORY SCORE', u'2009-2010 ADDITIONAL CREDIT'],
      dtype='object')]
(1677, 4)
Out[55]:
DBN | DISTRICT | SCHOOL | PRINCIPAL | PROGRESS REPORT TYPE | SCHOOL LEVEL* | PEER INDEX* | 2009-2010 ENVIRONMENT CATEGORY SCORE | 2009-2010 PERFORMANCE CATEGORY SCORE | 2009-2010 PROGRESS CATEGORY SCORE | 2009-2010 ADDITIONAL CREDIT | ClusterID
0 | 01M015 | 1 | P.S. 015 Roberto Clemente | Thomas Staebell | EMS | Elementary | 62.65 | -0.058789 | -0.965051 | -0.035660 | -0.998307 | 0
1 | 01M019 | 1 | P.S. 019 Asher Levy | Jacqueline Flanagan | EMS | Elementary | 48.94 | -0.287271 | -0.654780 | -0.250515 | -0.998307 | 0
2 | 01M020 | 1 | P.S. 020 Anna Silver | James Lee | EMS | Elementary | 57.68 | 0.540976 | -0.164878 | 1.160368 | 2.049193 | 1
3 | 01M034 | 1 | P.S. 034 Franklin D. Roosevelt | Joyce Stallings Harte | EMS | K-8 | 66.75 | -0.173030 | -0.246528 | 0.365403 | 0.965637 | 1
4 | 01M063 | 1 | P.S. 063 William McKinley | Darlene Despeignes | EMS | Elementary | 57.55 | 0.112572 | -0.295518 | 0.565935 | 0.119110 | 1
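
One way to sanity-check the two-cluster split (a hedged sketch; silhouette_score is not part of the original run) is the mean silhouette coefficient over the scaled features:

from sklearn.metrics import silhouette_score

# Close to +1 means the two clusters are well separated, close to 0 means
# they overlap; this scores the split but does not choose n_clusters.
print silhouette_score(X, cids)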

In [80]:
cols = df_pred.columns
print df_pred.ClusterID.value_counts()
print "\n".join("[%s] %s" % k for k in enumerate(df_pred.columns))
xi, yi = cols[6], cols[1]
cax = plt.scatter(df_pred[xi], df_pred[yi], c=df_pred[cols[11]], s=(1+df_pred[cols[6]]), cmap="Set1")
cbar = plt.colorbar(cax, ticks=[0,1], orientation='horizontal')
cbar.ax.set_xticklabels(['Cluster 0','Cluster 1'])  # horizontal colorbar

plt.xlabel(xi)
plt.ylabel(yi)


1    1006
0     671
dtype: int64
[0] DBN
[1] DISTRICT
[2] SCHOOL
[3] PRINCIPAL
[4] PROGRESS REPORT TYPE
[5] SCHOOL LEVEL*
[6] PEER INDEX*
[7] 2009-2010 ENVIRONMENT CATEGORY SCORE
[8] 2009-2010 PERFORMANCE CATEGORY SCORE
[9] 2009-2010 PROGRESS CATEGORY SCORE
[10] 2009-2010 ADDITIONAL CREDIT
[11] ClusterID
Out[80]:
<matplotlib.text.Text at 0x7f593686d210>
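
The scatter above plots PEER INDEX* against DISTRICT, with markers sized by peer index and colored by cluster. As an illustrative follow-up (not part of the original run), cross-tabulating the cluster labels against the overall letter grade from the source data would show whether the split tracks high- versus low-scoring schools:

# Rows are cluster IDs, columns are 2009-2010 overall grades; both frames
# share the same row index, so they align directly.
print pd.crosstab(df_pred["ClusterID"], df["2009-2010 OVERALL GRADE"])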

In [ ]: